
# Base search list: the `city` column of the uscities table, keeping only rows
# with a population above 100,000 and a state_id other than "PR".
cities <- fread("raw_data/simplemaps_uscities_basicv1.76/uscities.csv")
cities <- cities[state_id != "PR" & population > 100000, ]$city

# Extra search terms from the `name` column of city_abbr.csv
# (presumably city abbreviations, judging by the file name -- TODO confirm).
more <- read.csv("raw_data/city_abbr.csv")$name

# Single vector holding both sets of search terms
cities <- c(cities, more)
#####CLEAN CITY NAMES AS CORPUS#################################


# Normalise the search terms by running them through a tm corpus.
# The steps run in exactly this order: drop English stop words, strip
# punctuation, strip digits, lower-case, collapse repeated whitespace
# (so stray gaps are not picked up as words).
cities <- Corpus(VectorSource(as.vector(cities))) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(content_transformer(removePunctuation)) |>
  tm_map(content_transformer(removeNumbers)) |>
  tm_map(content_transformer(tolower)) |>
  tm_map(content_transformer(stripWhitespace))

inspect(cities) # print the cleaned corpus as a visual check

# Pull the cleaned strings back out of the corpus into a one-column data frame
cities <- data.frame(
  text = sapply(cities, as.character),
  stringsAsFactors = FALSE
)

# Character vector of cleaned names; used as the matching dictionary
cities_names <- cities[["text"]]


# City tokenizer: splits a document into words and returns every 1-, 2- and
# 3-word n-gram as a single space-separated string, so that city names made of
# up to three words can be matched as one term.
bigram_tokenizer <- function(x) {
  doc_words <- words(x)
  grams <- ngrams(doc_words, 1:3)
  joined <- lapply(grams, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}



############TEXT MINING FUNCTION############################

# Flag posts that mention a city from the `cities_names` dictionary.
#
# Reads "<file_name>1.csv" and "<file_name>2.csv", stacks them, cleans the
# `text` column and matches it against `cities_names` using
# `bigram_tokenizer` (both must already exist in the calling environment).
#
# Args:
#   file_name: path prefix of the two input files (without the "1.csv" /
#              "2.csv" suffix). The files must contain a `text` column.
#
# Side effects:
#   Assigns `flagged_posts` in the caller's frame: the stacked input plus an
#   `id_num` row id and columns `city_1`, `city_2`, ... holding the dictionary
#   terms found in each post (NA where there are none). `city_1` is always
#   present, even when nothing matched.
#   Prints a one-line summary of how many posts were flagged.
#
# Returns:
#   The summary string, invisibly (the value of print()).
text_mine <- function(file_name) {

  # fread() returns data.tables, so rbind() accepts fill = TRUE to pad
  # columns that exist in only one of the two files
  df <- rbind(
    fread(paste0(file_name, "1.csv")),
    fread(paste0(file_name, "2.csv")),
    fill = TRUE
  )

  if (!"text" %in% names(df)) {
    stop("input files must contain a `text` column", call. = FALSE)
  }

  backend <- df %>%
    mutate(id_num = row_number())

  # untouched copy: the original text is what ends up in the output
  backend_orig <- backend

  # remove quotation marks that tm misses
  backend$text <- removePunctuation(replace_curly_quote(backend$text))
  # need VCorpus here to work with the custom tokenizer later on
  corpus <- VCorpus(VectorSource(as.vector(backend$text)))

  # same cleaning steps, in the same order, as used for the city dictionary
  corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stop words
  corpus <- tm_map(corpus, content_transformer(removePunctuation)) # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removeNumbers)) # remove numbers
  corpus <- tm_map(corpus, content_transformer(tolower)) # all lower-case
  corpus <- tm_map(corpus, content_transformer(stripWhitespace))

  dtm <- DocumentTermMatrix(
    corpus,
    control = list(
      tokenizer = bigram_tokenizer,
      dictionary = cities_names,
      wordLengths = c(1, Inf)
    )
  )

  # Locate the non-zero (post, term) cells directly. Columns of `hits` are
  # addressed by position because their names depend on the matrix dimnames.
  term_counts <- as.matrix(dtm)
  hits <- which(term_counts != 0, arr.ind = TRUE)
  # order by post, then by term column, so city_1, city_2, ... follow the
  # term order of the matrix within each post
  hits <- hits[order(hits[, 1], hits[, 2]), , drop = FALSE]

  city_hits <- data.frame(
    id_num = as.integer(hits[, 1]),
    name = colnames(term_counts)[hits[, 2]],
    stringsAsFactors = FALSE
  )

  if (nrow(city_hits) > 0) {
    dtm_flag <- city_hits |>
      group_by(id_num) |>
      mutate(city_num = paste0("city_", row_number())) |>
      ungroup() |>
      pivot_wider(id_cols = id_num, names_from = city_num, values_from = name)
  } else {
    # nothing matched: keep the output schema stable with an all-NA city_1
    dtm_flag <- data.frame(
      id_num = integer(0),
      city_1 = character(0),
      stringsAsFactors = FALSE
    )
  }

  flagged_posts <- left_join(backend_orig, dtm_flag, by = "id_num")

  # The calling script reads `flagged_posts` from its own environment, so the
  # result is published there rather than returned.
  assign("flagged_posts", flagged_posts, envir = parent.frame())

  row_print <- sum(!is.na(flagged_posts$city_1))
  pct_flagged <- (row_print / nrow(flagged_posts)) * 100

  print(paste0(
    row_print, " (", pct_flagged, " percent) posts have been flagged out of ",
    nrow(backend)
  ))
}


# Run the city flagging on the fraud tweets (path prefix of the input files).
# text_mine() leaves its result in `flagged_posts` in this environment.
text_mine("raw_data/fraud_tweets")

# Cache the flagged tweets for later steps
saveRDS(flagged_posts, file = "temp/tweets_w_cities.rds")
